#from sklearn import datasets
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
from pathlib import Path
from sklearn.linear_model import Lasso

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import RandomOverSampler

import operator as op
import math

from shutil import copyfile


import numpy as np
import os
import sys

import csv
import zipfile


import plotting_rocs_monthly_JEEA

#read user input
# Positional command-line arguments (all eight are required):
#   1 tops        number of topics; used to build the 'ste_theta<k>' column
#                 names and the '_topics<tops>' part of every file name
#   2 tokens      tag used in input/output file names
#   3 cutter      tag used in input/output file names
#   4 kick_weird  only forwarded to the Stata do-file and the ROC plotting
#   5 where       tag used in output file names
#   6 model_type  the suffixes 'single_', 'upsample_' and 'myscore_' switch
#                 behaviour further down
#   7 longer      0, 1 or 2; selects predictor sets and dependent variables
#   8 comp        path prefix for all inputs and outputs; it is concatenated
#                 directly with file names, so it has to end in a separator
arguments = sys.argv

# Fail with a readable message instead of a bare IndexError further down.
if len(arguments) < 9:
	sys.exit('usage: python <script> tops tokens cutter kick_weird where model_type longer comp')

tops = int(arguments[1])
tokens = arguments[2]
cutter = arguments[3]
kick_weird = arguments[4]
where = arguments[5]
model_type = arguments[6]
longer = int(arguments[7])
comp = arguments[8]

# Inputs are read from and outputs written to the same location.
spitout = comp
readin = spitout

 
# Directory that is expected to hold the merged monthly CSV inputs.
zip_file =  readin + 'merged_' + tokens+'_topics'+str(tops)+'_'+str(cutter)+'/'

# If that directory is missing, try to extract it from the matching archive.
if not os.path.exists(zip_file):
	# NOTE(review): the archive name carries no `readin` prefix, so it is
	# looked up relative to the current working directory - confirm intended.
	the_thang = 'merged_' + tokens+'_topics'+str(tops)+'_'+str(cutter)+ '.zip'
	try:
		print('start unzipping zip file of merged input')
		with zipfile.ZipFile(the_thang) as z:
			z.extractall(readin)
		print("Extracted all zip file of merged input")
	except Exception as err:
		# Extraction stays best-effort (the script carries on without the
		# data), but Ctrl-C is no longer swallowed and the cause is reported.
		print("Invalid zip file of merged input")
		print('reason: ', repr(err))




# Module-level estimator objects; clf3 is re-bound later when individual
# models are tuned.
clf3 = GaussianNB()
clf4 = Lasso(alpha=0.00001, max_iter=10000)
lr = LogisticRegression()

# Every dependent variable that has been processed is collected here so the
# full list can be handed to Stata at the end.
depos = []

# Base names of the dependent variables; longer == 2 switches to the
# 'ged_best_*' outcomes.
deps = ['ged_best_ns', 'ged_best_os', 'ged_best_sb'] if longer == 2 else ['anyviolence', 'armedconf']


#here I take these do files and copy them on to my harddisk due to annoying blank problem
#copyfile(readin+'/ensemble_monthly_JEEA.do', user_place + '/ensemble_monthly_JEEA.do')
#copyfile(readin+'/add_commodity_prices_JEEA.do', user_place + '/add_commodity_prices_JEEA.do')




# ICEWS predictor names: four fixed columns followed by the numbered event
# columns 'gov_events_<k>' and 'events_<k>' for k = 1..20 (interleaved per k).
ICEWS_short = ['protests_icews', 'intermediate', 'gov_opp_cameo', 'gov_cameo']

icews_events = ['gov_events_', 'events_']
ICEWS_short.extend(
	prefix + str(number)
	for number in range(1, 21)
	for prefix in icews_events
)
	



# name_inputs holds one label per predictor set; the labels are used in the
# log file and in output file names and are indexed in parallel with the
# `inputter` list built inside the main loop.
# commodity_now == 1 makes the main loop route every monthly file through the
# Stata do-file that adds commodity prices before it is read.
commodity_now = 0
if longer == 1:
	commodity_now = 1
	name_inputs = [
		'text', 'dummies', 'mine',
		'silverstone', 'mine_gold',
		'commodity', 'commodity_alone',
		'commodity_short', 'commodity_short_alone',
		'commodity_region', 'commodity_region_alone',
	]
elif model_type.endswith('single_'):
	name_inputs = ['neighbor', 'Logit', 'neural', 'adaboost', 'randomforest']
else:
	name_inputs = ['text', 'dummies', 'mine']



# Identifier columns.
these = ['isocode', 'year', 'quarter']

# Extra tag for output file names of single-model runs; only set when the
# model type is exactly 'upsample_'.
suffix = 'upsample_' if model_type == 'upsample_' else ''

# Diagnostics: show what the suffix checks further down will see.
print('model type last seven', model_type[-7:])
print('model type ', model_type)



# Scoring rule handed to every GridSearchCV / cross_val_score call.
# Index 0 is AUC; index 1 is reserved for a custom cost-based scorer.
scorers = ['roc_auc']
if model_type[-8:] == 'myscore_':
	# No custom scorer is registered in `scorers`, so this path used to die
	# with an unexplained IndexError. Fail with an actionable message instead.
	if len(scorers) < 2:
		raise ValueError("model_type ends with 'myscore_' but no custom scorer is defined; append it to `scorers` first")
	score_with = scorers[1]
	print('You are scoring with your own costs')
else:
	score_with = scorers[0]
	print('You are scoring with AUC')

	



def in_out(X, which, depo, futuro):
	"""Build standardised training and forecasting inputs.

	X       -- data frame with the training observations
	which   -- names of the predictor columns
	depo    -- name of the outcome column
	futuro  -- data frame with the observations to predict for

	Rows with a missing value in any selected column are dropped. The scaler
	is fitted on the training predictors only and then applied to both sets.

	Returns (scaled training predictors, outcome vector, scaled forecasting
	predictors or [] when no complete forecasting row exists, one-column
	frame with the 'isocode' of the forecasting rows, fitted scaler).
	"""
	predictors = list(which)

	# forecasting sample: predictors plus country code, complete rows only
	forecast_rows = futuro[predictors + ['isocode']].dropna(how='any')
	countries_ready = forecast_rows[['isocode']]
	forecast_rows = forecast_rows.drop('isocode', axis=1)

	# training sample: predictors plus outcome, complete rows only
	train_rows = X[predictors + [depo]].dropna(how='any')
	y = train_rows[[depo]].values.ravel()
	train_rows = train_rows[which]

	# scale everything (required for neural network)
	scaler = StandardScaler()
	scaler.fit(train_rows)
	haurein = scaler.transform(train_rows)
	hauraus = scaler.transform(forecast_rows) if len(forecast_rows) > 0 else []

	return haurein, y, hauraus, countries_ready, scaler






# Text predictors: for every topic k the columns 'ste_theta<k>' and
# 'ste_theta<k>_stock', followed by the 'tokens' column.
text1 = []
for topic in range(tops):
	share = 'ste_theta' + str(topic)
	text1.extend([share, share + '_stock'])
text1.append('tokens')
print('You are adding log tokens')

# Values of g to loop over; they become the numeric suffix of the dependent
# variable names. Only the 30-topic baseline run (longer == 0) does all three.
go_through = [1,3,12] if tops == 30 and longer == 0 else [1]

# Main driver. For every g, dependent variable and conflict type:
#   * tune the classifier(s) once, on the first January sample that exists,
#   * refit every January on all observations dated before that month,
#   * write one CSV of predicted probabilities per month to <spitout>temp/,
#   * let the Stata do-file assemble the results and finally plot ROC curves.
# NOTE(review): the Stata calls are shell strings built from command-line
# arguments and passed to os.system; only run this with trusted arguments.
for g in go_through:

	# Log of the hyper-parameters picked by the grid searches for this g.
	chosen_path = spitout + 'chosenmodels_long_' + where + model_type + '_' +tokens + '_' +str(cutter)+'_topics'+str(tops)+'_'+str(longer)+'_'+str(g)+'.txt'
	with open(chosen_path, 'w') as text_file:
		text_file.write('Model parameters chosen \n')

	for dup in deps:

		# The variant without the 'ons_' prefix is only run for g == 1 in the
		# baseline setting.
		if longer == 0 and g == 1:
			conf_types = ['ons_','']
		else:
			conf_types = ['ons_']

		for conf_type in conf_types:

			# Name of the outcome column, e.g. 'ons_anyviolence1'.
			dep = conf_type + dup + str(g)

			print(dep)

			# Named predictor sets. All of them have to stay defined, even when
			# unused in a given run: in 'single_' mode one is looked up by name.
			gold = ['childmortality','democracy0','democracy1','democracy2','democracy3','democracy4','democracy5']
			commodity_alone = ['minerals','oil','agricultural', 'totalcomdata']
			commodity_region_alone = commodity_alone + ['where1','where2','where3','where4','where5','where6']
			commodity_short_alone = ['totalcomdata']

			dummies = ['anyviolence_dp', 'armedconf_dp', 'civilwar_dp']
			if longer == 2:
				dummies = ['ged_best_ns_dp', 'ged_best_os_dp', 'ged_best_sb_dp', 'armedconf_dp', 'civilwar_dp']
			elif dup != 'anyviolence' or conf_type == '':
				dummies.append('best')

			# Added for every dependent variable (the former if/else did the
			# same thing in both branches).
			gold.append('contig_anyviolence')

			texters = [text1]

			mores = [model_type]

			m = -1

			for model in mores:

				print(model)

				runs = -1

				for text in texters:

					first_round = 1

					if text == text1:
						runs = 1

					print('runs ', runs)

					more = []

					mine = text + dummies

					ICEWS = ICEWS_short
					goldstone = gold + dummies
					silverstone = gold
					commodity = mine + commodity_alone
					commodity_short = mine + commodity_short_alone
					commodity_region = mine + commodity_region_alone

					mine_ICEWS = mine + ICEWS_short
					mine_gold  = mine + gold

					# inputter[i] is the i-th predictor set and name_inputs[i]
					# its label.
					inputter = []
					if model[-7:] != 'single_':

						inputter.append(text)
						inputter.append(dummies)
						inputter.append(mine)

						if longer == 1:

							inputter.append(silverstone)
							inputter.append(mine_gold)

							inputter.append(commodity)
							inputter.append(commodity_alone)

							inputter.append(commodity_short)
							inputter.append(commodity_short_alone)

							inputter.append(commodity_region)
							inputter.append(commodity_region_alone)

					#when single then I am only using one set of predictors (e.g. text)
					else:
						print('schrott')
						print(model[0:len(model)-7])

						# SECURITY NOTE(review): this evaluates text taken from
						# the command line (model_type minus its 'single_'
						# suffix) to pick one of the predictor lists above.
						# Only run with trusted arguments; a name->list dict
						# would be the safe replacement.
						inputter.append(eval(model[0:len(model)-7]))

					obs = 0

					for i in range(0,len(inputter)):
						print(inputter[i])
						print('now looping over years')

						first_round = 1
						first_year = 2005

						for year in range(first_year,2021):
							#only train model for January of each year
							m = 1

							my_file = Path(spitout + 'merged_' + tokens+"_topics"+str(tops)+'_'+cutter+ '/merged_' + tokens+"_topics"+str(tops)+'_'+cutter+"_m"+str(m)+"_y"+str(year)+".csv")
							if my_file.is_file():

								if commodity_now == 0:
									data = pd.read_csv(my_file, sep=',')

								else:
									#add commodity prices (the do-file leaves its result in temp.csv)
									os.system('/Applications/Stata/StataMP.app/Contents/MacOS/stata-mp -b do '+str(readin +'add_commodity_prices_JEEA.do')+' '+str('merged_' + tokens+"_topics"+str(tops)+'_'+cutter+'/merged_' + tokens+"_topics"+str(tops)+'_'+cutter+"_m"+str(m)+"_y"+str(year)+".csv")+' '+comp)
									data = pd.read_csv(spitout + 'temp.csv', sep=',')

								# `future` keeps only the rows of this month; the
								# training data keeps only rows dated before it.
								future = data.drop(data[(data.year != year) | (data.month != m)].index)
								data = data.drop(data[(data.year == year) & (data.month == m)].index)
								data = data.drop(data[(data.year > year)].index)
								data = data.drop(data[(data.year == year) & (data.month > m)].index)

								[X, Y, future_out, countries, scalers] = in_out(data, inputter[i], dep, future)

								print('obs ', len(Y))

								# Hyper-parameters are tuned once per predictor
								# set, on the first January sample that exists.
								if first_round == 1:

									print('entered!')

									first_round = 0

									if model[-7:] == 'single_':

										# --- AdaBoost over decision stumps ---
										# NOTE(review): newer scikit-learn releases
										# call this argument `estimator` - confirm
										# against the installed version.
										clf = DecisionTreeClassifier(criterion='gini', max_depth=1, random_state = 2502)
										svc = AdaBoostClassifier(base_estimator=clf)
										pars = list(range(1,100,2))
										tuned_parameters = [{'n_estimators': pars,}]

										clfx = GridSearchCV(svc, tuned_parameters, cv=3,scoring=score_with)
										clfx.fit(X, Y)

										print("Best parameters set found on development set:")
										print()
										print(clfx.best_params_)
										print("AUC :",clfx.best_score_)
										print()

										hei = name_inputs[i] + ' AdaBoost ' + dep + ' ' + str(clfx.best_params_) + '\n'
										with open(chosen_path, 'a') as text_file:
											text_file.write(hei)

										clf3 = clfx.best_estimator_

										# --- k nearest neighbours ---
										svc = KNeighborsClassifier()

										pars = list(range(700,1000,50))
										tuned_parameters = [{'n_neighbors': pars,}]

										clfx = GridSearchCV(svc, tuned_parameters, cv=3, scoring=score_with)
										clfx.fit(X, Y)

										print("Best parameters set found on development set:")
										print()
										print(clfx.best_params_)
										print("AUC :",clfx.best_score_)
										print()

										hei = name_inputs[i] + ' Neighbors ' + dep + ' ' + str(clfx.best_params_) + '\n'
										with open(chosen_path, 'a') as text_file:
											text_file.write(hei)

										clf0 = clfx.best_estimator_

										# --- neural network ---
										#no need to choose learning_rate with lbfgs
										#activation: logistic sucks, relu and tanh take too long (and perform worse than identity)
										#tried other cvs (3 best)
										svc = MLPClassifier(solver='lbfgs', random_state=2520, max_iter=100000, activation='identity')
										tuned_parameters = [{'hidden_layer_sizes': [(2,2), (2,2,2), (7,7,7), (5,5,5),(2,5,7), (7,5,2),(10,10,10), (20,20,20), (5,50,50),(7,150,100),(7,150),(5,100,20),(7,100,100)],'alpha': [0.001,0.0001,0.00001,0.000001,0.0000001],}]

										clfx = GridSearchCV(svc, tuned_parameters, cv=3, scoring=score_with)
										clfx.fit(X, Y)

										print("Best parameters set found on development set:")
										print()
										print(clfx.best_params_)
										print("AUC :",clfx.best_score_)
										print()

										hei = name_inputs[i] + ' Neural ' + dep + ' ' + str(clfx.best_params_) + '\n'
										with open(chosen_path, 'a') as text_file:
											text_file.write(hei)

										clf2 = clfx.best_estimator_

										# --- penalised logit ---
										logreg=LogisticRegression(random_state=2520, max_iter=10000, solver='liblinear')

										grid={"C":np.logspace(-3,-1,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge

										logreg_cv=GridSearchCV(logreg,grid, cv=3,scoring=score_with)
										logreg_cv.fit(X, Y)

										print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
										print("AUC :",logreg_cv.best_score_)

										print("Best parameters set found on development set:")
										print()
										print(logreg_cv.best_params_)
										print()

										hei = name_inputs[i] + ' Logit ' + dep + ' ' + str(logreg_cv.best_params_) + '\n'
										with open(chosen_path, 'a') as text_file:
											text_file.write(hei)

										clf1 = logreg_cv.best_estimator_

									# --- random forest (tuned for every model type) ---
									# NOTE(review): newer scikit-learn releases no
									# longer accept max_features='auto' - confirm
									# against the installed version.
									svc = RandomForestClassifier(random_state=2520)

									if g < 12:
										tuned_parameters = [{'max_depth': [3,7,8,10],'n_estimators': [200,400,500,600,700],'min_samples_split': [6, 12, 18],'max_features': ['auto'],'min_samples_leaf': [3,6],}]

									else:
										tuned_parameters = [{'max_depth': [3,7,8,10],'n_estimators': [200,400,500,600,700],'min_samples_split': [24,36],'max_features': ['auto'],'min_samples_leaf': [12],}]

									clfx = GridSearchCV(svc, tuned_parameters, cv=3,scoring=score_with)
									clfx.fit(X, Y)

									print("Best parameters set found on development set:")
									print()
									print(clfx.best_params_)
									print("AUC :",clfx.best_score_)
									print()

									hei = name_inputs[i] + ' Forest ' + dep + ' ' + str(clfx.best_params_) + '\n'
									with open(chosen_path, 'a') as text_file:
										text_file.write(hei)

									# Fresh, unfitted forest with the chosen settings.
									clf5 = RandomForestClassifier(max_depth=clfx.best_params_['max_depth'],n_estimators =clfx.best_params_['n_estimators'],min_samples_split =clfx.best_params_['min_samples_split'],max_features=clfx.best_params_['max_features'],min_samples_leaf=clfx.best_params_['min_samples_leaf'], random_state=2520)

									# Estimators to run and the labels used in
									# their output file names.
									if model[-7:] == 'single_':
										zipper = [clf5, clf0, clf1, clf2, clf3]
										labels = ['RandomForest','Neighbor', 'Logit', 'Neural', 'AdaBoost']

									else:
										zipper = [clf5]
										labels = ['RandomForest']

								if len(X) > 0 and len(Y) > 0:

									if model_type[-9:] == 'upsample_':
										ros = RandomOverSampler()
										print('pre oversampling ', len(X))
										reweight = sum(Y)
										print('preweight ', reweight)
										# fit_resample is the supported name;
										# fit_sample was removed from imbalanced-learn.
										X, Y = ros.fit_resample(X, Y)
										print('post oversampling ', len(X))

										# ratio of positives before / after oversampling,
										# used to scale the predicted probabilities back
										reweight = reweight/sum(Y)
										print('reweight ', reweight)

									#now loop over models
									for clf, label in zip(zipper,labels):

										if first_year == year:
											scores = cross_val_score(clf, X, Y, cv=3, scoring=score_with)
											print(str(year), " ", dup, " ", name_inputs[i], " AUC: %0.3f (+/- %0.3f) [%s]" % (scores.mean(), scores.std(), label))
										clf.fit(X, Y)

										# Predict each month of this year with the
										# model fitted on the January sample.
										for m in range(1, 13):
											my_file = Path(spitout + 'merged_' + tokens+"_topics"+str(tops)+'_'+cutter+ '/merged_' + tokens+"_topics"+str(tops)+'_'+cutter+"_m"+str(m)+"_y"+str(year)+".csv")
											if my_file.is_file():
												print(year, 'm', m)
												if commodity_now == 0:
													data = pd.read_csv(my_file, sep=',')

												else:
													#add commodity prices
													os.system('/Applications/Stata/StataMP.app/Contents/MacOS/stata-mp -b do '+str(readin +'add_commodity_prices_JEEA.do')+' '+' '+str('merged_' + tokens+"_topics"+str(tops)+'_'+cutter+'/merged_' + tokens+"_topics"+str(tops)+'_'+cutter+"_m"+str(m)+"_y"+str(year)+".csv")+' '+comp)
													data = pd.read_csv(spitout + 'temp.csv', sep=',')

												future = data.drop(data[(data.year != year) | (data.month != m)].index)

												# Only the scaled forecasting rows and their
												# country codes are needed here. X and Y must
												# NOT be rebound: the remaining estimators in
												# `zipper` are still to be fitted on the
												# January training sample, and rebinding made
												# them train on this un-truncated monthly file.
												# NOTE(review): the forecasting rows are scaled
												# with a scaler fitted on this month's file, not
												# with the one from the training sample - confirm
												# this is intended.
												_, _, future_out, countries, _ = in_out(data, inputter[i], dep, future)
												if len(future_out) > 0:
													res = clf.predict_proba(future_out)
													if model_type[-9:] == 'upsample_':
														res = res * reweight
												else:
													res = []
													print('only missings')

												if len(res) > 0:

													# second column = probability of class 1
													cups = countries
													bla = pd.DataFrame(res[:, 1])
													ei = cups['isocode'].values.tolist()
													bla = bla.assign(isocode= ei)
													if model[-7:] != 'single_':
														bla.to_csv(spitout + 'temp/' + name_inputs[i] + where + label  +tokens + "_" +dep+str(cutter)+"_topics"+str(tops)+"_m"+str(m)+"_y"+str(year)+"_g"+str(g)+".csv")
													else:
														bla.to_csv(spitout + 'temp/' + label + where + suffix + model + tokens + "_" +dep+str(cutter)+"_topics"+str(tops)+"_m"+str(m)+"_y"+str(year)+"_g"+str(g)+".csv")

											else:
												print('didnt find file ', my_file)

			#collecting all dependent variables
			depos.append(dep)
			#summarizing this dependent variable
			print('Appending results for this dependent variable in Stata: ', dep)
			os.system('/Applications/Stata/StataMP.app/Contents/MacOS/stata-mp -b do  '+str(readin +'ensemble_monthly_JEEA.do')+' '+str(tops)+' '+tokens+' '+ str(cutter)+' '+ str(kick_weird)+' '+ str(where)+' '+ model_type+' '+str(longer)+' '+str(g)+' '+spitout+' '+'0'+' '+dep)

	# All dependent variables collected so far go to Stata as one
	# '?'-separated argument.
	print('Appending full results in Stata: ', depos)
	deposx = ''.join('?' + d for d in depos)
	os.system('/Applications/Stata/StataMP.app/Contents/MacOS/stata-mp -b do   '+str(readin +'ensemble_monthly_JEEA.do')+' '+str(tops)+' '+tokens+' '+ str(cutter)+' '+ str(kick_weird)+' '+ str(where)+' '+ model_type+' '+str(longer)+' '+str(g)+' '+spitout+' '+'1'+' '+deposx)
	plotting_rocs_monthly_JEEA.plotting_rocs(tops, tokens,cutter, kick_weird, where, model_type, longer, g, comp, deps)
	#plotting_separation_monthly20200628.separating_dem(tops, tokens,cutter, kick_weird, where, model_type, longer, g)

										

#for year in range(2010,2022):
#	for m in range(1, 13):					
#		my_file = Path(spitout + model_type+tokens+"_topics"+str(tops)+'_'+cutter+"_m"+str(m)+"_y"+str(year)+".csv")
#		if my_file.is_file():
			#os.remove(my_file)
#			hola = 1
	



